############### ###############
## 05 - CBO individuals cleaning
## Project: CBO
## Author: Kamil Kouhen
## Purpose: Cleaning and management of CBO data at the individual level
## Date of creation: 07/01/2022
############### ###############

library(here)
#Running master file and ad-hoc function rcodes
#source(here("Code", "Rcode", "Master.R"), echo = T) #Master (contains necessary packages)

### Only keeping variables that are useful for CBO project analysis ###
# Drops the scan/question bookkeeping columns and every column whose name contains "_00".
CBO_individuals_intermediate <- CBO_individuals_raw %>%
  select(-scan, -question_yn, -question, -statu, -contains("_00"))

### Missing characters as NA ###
# Number of NA cells before the recoding.
sum(is.na(CBO_individuals_intermediate))
# Empty strings in character columns are recoded as NA.
CBO_individuals_intermediate <- CBO_individuals_intermediate %>%
  mutate(across(where(is.character), ~ ifelse(.x == "", NA, as.character(.x))))
# Number of NA cells after the recoding, to compare with the count above.
sum(is.na(CBO_individuals_intermediate))

### Id variables as character ###
# The four identifier columns are converted with as.character() in one pass.
CBO_individuals_intermediate <- CBO_individuals_intermediate %>%
  mutate(across(all_of(c("region", "commune", "appcode", "id_IND")), as.character))

### Checking survey completion ###

  # Note # IPA provided an individual-level dataset that only contains consented observations, making sure it's true
# %in% never returns NA, so a missing consent value is treated as "not consented" and
# triggers the explicit message below instead of an uninformative error inside if().
if (!all(CBO_individuals_intermediate$consent %in% "yes")) {
  stop("There are obs who did not express consent, please check and correct if necessary")
}

### Ad-hoc function to create report with share of NAs for each variable in dataframe ###
share_NAs(CBO_individuals_intermediate) #File exported in here("Output", "For Cleaning")

### Random check of integrity of dataset ###
# 1) The number of rows must be unchanged since the raw upload.
if (nrow(CBO_individuals_intermediate) != nrow(CBO_individuals_raw)) {
  stop("Something went wrong: some observations were dropped since upload of CBO_individuals_raw")
}
# 2) The per-commune group sizes must be the same in both datasets.
#    Sorted size vectors are compared so the check is symmetric and keeps duplicated sizes
#    (a setdiff() on count values is one-directional and de-duplicates them).
if (!identical(
  sort(count(CBO_individuals_intermediate, commune)$n),
  sort(count(CBO_individuals_raw, commune)$n)
)) {
  stop("Something went wrong: Inconsistent number of obs per commune.")
}
###                                      ####  

### Looking for non standard missing values (e.g. negative such as -999 or -888 or -97 or -777) ###
# Working copy used for detection only: columns whose content is entirely numeric are converted
# to numeric (Hmisc::all.is.numeric), then only numeric columns outside the _BL/_MON/_S suffixes
# that hold at least one negative value are kept. Computed once and reused below.
negative_value_columns <- CBO_individuals_intermediate %>%
  mutate_all(function(x) Hmisc::all.is.numeric(x, what = "vector", extras = NA)) %>%
  select_if(is.numeric) %>%
  select(-ends_with("_BL"), -ends_with("_MON"), -ends_with("_S")) %>% #Not including baseline, monitoring and supermun variables
  keep(~ any(as.numeric(.x) < 0 & !is.na(.x)))

if (length(negative_value_columns) > 0) {
  print("Some numeric variables contain values that seem to be non-standard missing (e.g. -999, or -97)")

  #Per the codebook, many variables used -999, -888 & -777 to record missing values
  # Explicit print() calls: expressions inside an if-block are not auto-printed.
  print(names(negative_value_columns)) #Displaying name of columns in this case
  print(sort(unique(unlist(
    lapply(negative_value_columns, function(x) x[!is.na(x) & x < 0]),
    use.names = FALSE
  )))) #Displaying values in this case

    #Note# -1 and -2 values are legitimate, they are part of multi-choice questions

  #Replacing all -999, -888, -97 & -777 values by NA for all variables
  # (baseline, monitoring and supermun columns are left untouched).
  # The same four codes are applied to character and numeric columns alike.
  # %in% keeps the column type when a column is entirely NA.
  CBO_individuals_intermediate %<>%
    mutate(across(
      where(is.character) & !ends_with(c("_BL", "_MON", "_S")),
      ~ ifelse(.x %in% c("-999", "-888", "-97", "-777"), NA, .x)
    )) %>%
    mutate(across(
      where(is.numeric) & !ends_with(c("_BL", "_MON", "_S")),
      ~ ifelse(.x %in% c(-999, -888, -97, -777), NA, .x)
    ))

  #In case there are "." values
  CBO_individuals_intermediate %<>%
    mutate(across(where(is.character), ~ ifelse(.x == ".", NA, .x)))

  #Checking if there are other types of non-standard missing values
  # Only columns that are already numeric are inspected; -1 and -2 are accepted (see note above).
  remaining_suspects <- CBO_individuals_intermediate %>%
    select_if(is.numeric) %>%
    select(-ends_with("_BL"), -ends_with("_S"), -ends_with("_MON")) %>%
    keep(~ any((.x < 0 & .x != -1 & .x != -2) & !is.na(.x))) %>%
    names()

  if (length(remaining_suspects) != 0) {
    print(remaining_suspects) #Displaying them
    stop("There still seem to be non-standard missing values, please check before continuing.")
  } else {
    print("All non-standard missing values have been identified.")
  }
  rm(remaining_suspects)
}
rm(negative_value_columns)

### Identifying variables to be recoded as factor (e.g. if it contains a Yes/No pattern) ###
#Yes/no types
# A column qualifies when it is character, at least one of its values contains a yes/no
# pattern (substring match), and its longest non-missing value is shorter than 8 characters.
# The pattern is tested column by column rather than on the data frame as a whole, and
# na.rm = TRUE keeps missing values from turning the length test into NA.
prefactor <- CBO_individuals_intermediate %>%
  select(where(~ is.character(.x) &&
                 any(grepl("yes|no|Yes|No|YES|NO|oui|non|Oui|Non|OUI|NON", .x)) &&
                 max(nchar(.x), na.rm = TRUE) < 8)) %>%
  colnames() ##Identifying variables that are susceptible to be converted as factor (small maximum number of string)
#map(prefactor, unique) #The only non "yes" or "no" string is "dk" for "don't know". I leave them for now before getting's Malte's view on how to deal with these missing values
CBO_individuals_intermediate[prefactor] <- lapply(CBO_individuals_intermediate[prefactor], factor)  ## as.factor() could also be used
rm(prefactor)

#Binary numeric vars (0/1 or 1/2 types)
# 0/1 type: numeric, exactly two distinct non-missing values, maximum equal to 1, minimum >= 0.
# The outer parentheses print the selected column names.
(binary01_should_be_factor <- CBO_individuals_intermediate %>%
    select(where(~ is.numeric(.x) &&
                   length(unique(na.omit(.x))) == 2 &&
                   max(.x, na.rm = TRUE) == 1 &&
                   min(.x, na.rm = TRUE) >= 0)) %>%
    colnames())

# 1/2 type: numeric, exactly two distinct non-missing values, maximum equal to 2, minimum >= 1.
(binary12_should_be_factor <- CBO_individuals_intermediate %>%
    select(where(~ is.numeric(.x) &&
                   length(unique(na.omit(.x))) == 2 &&
                   max(.x, na.rm = TRUE) == 2 &&
                   min(.x, na.rm = TRUE) >= 1)) %>%
    colnames())

# Both groups are converted to factor in a single step.
CBO_individuals_intermediate <- CBO_individuals_intermediate %>%
  mutate(across(all_of(c(binary01_should_be_factor, binary12_should_be_factor)), factor))

rm(binary01_should_be_factor, binary12_should_be_factor) #Cleaning the environment

#Categ variables in the -2 to 2 scale or any different scale that seems categorical (max below or equal to 2)
# Selected: numeric columns with more than 3 distinct non-missing values, all within [-2, 2].
# The outer parentheses print the selected column names.
(categnum_should_be_factor <- CBO_individuals_intermediate %>%
    select(where(~ is.numeric(.x) &&
                   length(unique(na.omit(.x))) > 3 &&
                   max(.x, na.rm = TRUE) <= 2 &&
                   min(.x, na.rm = TRUE) >= -2)) %>%
    colnames())

CBO_individuals_intermediate <- CBO_individuals_intermediate %>%
  mutate(across(all_of(categnum_should_be_factor), factor))
rm(categnum_should_be_factor)

#Identifying numeric variables that should be categorical (factor) using the questionnaire
# The "survey" sheet is read explicitly, like the labelling step further down that reads the
# same workbook, so the result does not depend on sheet order.
IND_questionnaire <- readxl::read_excel(
  here("Supporting Documents", "IPA deliverables", "3_Questionnaires finaux OCB & DECIDEURS", "OCB_Questionnaire_endline_individual_v2.xlsx"),
  sheet = "survey"
) #Importing questionnaire

# Questions whose type contains "select", excluding types that contain "phone".
selectvars <- IND_questionnaire %>%
  filter(grepl("select", type), !grepl("phone", type)) %>%
  select(name)

shouldbefactor <- CBO_individuals_intermediate %>%
  select_if(is.numeric) %>%
  select(any_of(selectvars$name)) %>%  #Numeric variables that are identified as "select" questions in IPA questionnaire
  colnames()

CBO_individuals_intermediate[shouldbefactor] <- lapply(CBO_individuals_intermediate[shouldbefactor], factor)

#Same with character variables (those select_one type vars)
selectvars <- IND_questionnaire %>%
  filter(grepl("select_one", type)) %>%
  select(name)

shouldbefactor <- CBO_individuals_intermediate %>%
  select_if(is.character) %>%
  select(any_of(selectvars$name)) %>%  #Character variables that are identified as "select_one" questions in IPA questionnaire
  colnames()

CBO_individuals_intermediate[shouldbefactor] <- lapply(CBO_individuals_intermediate[shouldbefactor], factor)

rm(shouldbefactor, selectvars)

#Writing yes as "1" and no as "0" ("dk" levels are left unchanged by this step)
#Done level by level: only factors that actually have the level are recoded.

# Factors with a "yes" level -> "1"
tochange <- CBO_individuals_intermediate %>%
  select(where(~ is.factor(.x) && "yes" %in% levels(.x))) %>%
  colnames()
CBO_individuals_intermediate <- CBO_individuals_intermediate %>%
  mutate(across(all_of(tochange), ~ forcats::fct_recode(.x, "1" = "yes")))

# Factors with a "no" level -> "0"
tochange <- CBO_individuals_intermediate %>%
  select(where(~ is.factor(.x) && "no" %in% levels(.x))) %>%
  colnames()
CBO_individuals_intermediate <- CBO_individuals_intermediate %>%
  mutate(across(all_of(tochange), ~ forcats::fct_recode(.x, "0" = "no")))

rm(tochange)

### Random check of integrity of dataset ###
# 1) The number of rows must be unchanged since the raw upload.
if (nrow(CBO_individuals_intermediate) != nrow(CBO_individuals_raw)) {
  stop("Something went wrong: some observations were dropped since upload of CBO_individuals_raw")
}
# 2) The per-commune group sizes must be the same in both datasets.
#    Sorted size vectors are compared so the check is symmetric and keeps duplicated sizes
#    (a setdiff() on count values is one-directional and de-duplicates them).
if (!identical(
  sort(count(CBO_individuals_intermediate, commune)$n),
  sort(count(CBO_individuals_raw, commune)$n)
)) {
  stop("Something went wrong: Inconsistent number of obs per commune.")
}
###                                      ####  

### Using questionnaire to label variables ###
# Name / English label pairs from the "survey" sheet, restricted to variables present in the data.
varlabels <- readxl::read_excel(
  here("Supporting Documents", "IPA deliverables", "3_Questionnaires finaux OCB & DECIDEURS", "OCB_Questionnaire_endline_individual_v2.xlsx"),
  sheet = "survey"
) %>% #Importing questionnaire
  select(name, "label::English") %>%
  rename(label = "label::English") %>%
  filter(name %in% colnames(CBO_individuals_intermediate))

# Named list (variable name -> label) passed to upData(). Building it directly avoids the
# matrix/transpose reshape and the as_tibble() column-name warning that came with it.
varlabels <- setNames(as.list(varlabels$label), varlabels$name)

CBO_individuals_intermediate <- Hmisc::upData(CBO_individuals_intermediate, labels = varlabels) #Variables are labelled
rm(varlabels)

### Checking if some observations have too many missing values ###
# Two temporary columns: number of NA cells per row, and that number divided by the
# column count of the dataset as it stood before these helper columns were added.
CBO_individuals_intermediate <- CBO_individuals_intermediate %>%
  mutate(
    todrop_nb.NA = rowSums(is.na(.)),
    todrop_share.NA.analysis = todrop_nb.NA / ncol(CBO_individuals_intermediate)
  )
sumstats(CBO_individuals_intermediate$todrop_nb.NA) #User-written function for simple summary stats
sumstats(CBO_individuals_intermediate$todrop_share.NA.analysis) #User-written function for simple summary stats

# Abort when any row has more than 70% missing columns.
if (any(CBO_individuals_intermediate$todrop_share.NA.analysis > 0.70)) {
  stop("There are observation with more than 70% of columns with missing values, please check and assess.")
}

# Removing the temporary helper columns
CBO_individuals_intermediate <- CBO_individuals_intermediate %>%
  select(-contains("todrop_"))

### Saving intermediate (pre-preparation for analysis) cleaned IND dataset ###
# Written as an RDS file under the <datatype>/Intermediate folder of the project.
CBO_individuals_intermediate %>%
  saveRDS(file = here(datatype, "Intermediate", "CBO_individuals_intermediate.RDS"))

message("**05 completed")

